package cn.doitedu.ml.tfidf

import cn.doitedu.commons.util.SparkUtil
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{HashingTF, IDF}

/**
 * @date: 2020/2/22
 * @site: www.doitedu.cn
 * @author: hunter.d 涛哥
 * @qq: 657270652
 * @description: Uses the ready-made TF-IDF model from Spark's ML library to turn raw text documents into TF-IDF feature vectors.
 */
object TFIDF_Mllib {

  // Defaults used when no CLI arguments are supplied (behavior identical to the original demo).
  private val DefaultInputPath   = "userprofile/data/demo/tfidf/docs.txt"
  private val DefaultNumFeatures = 26

  /**
   * Entry point: loads a CSV of documents, tokenizes them, hashes tokens into
   * term-frequency vectors, then fits an IDF model and rescales TF into TF-IDF.
   *
   * @param args optional overrides:
   *             args(0) = input CSV path (header expected, columns: docid, doc)
   *             args(1) = hash-bucket count for HashingTF (must parse as Int)
   */
  def main(args: Array[String]): Unit = {

    // Silence Spark's verbose INFO logging for a readable demo output.
    Logger.getLogger("org").setLevel(Level.WARN)

    // Generalized: path and feature count are now CLI-overridable; defaults
    // keep the zero-argument behavior exactly as before.
    val inputPath   = if (args.length > 0) args(0) else DefaultInputPath
    val numFeatures = if (args.length > 1) args(1).toInt else DefaultNumFeatures

    val spark = SparkUtil.getSparkSession(this.getClass.getSimpleName)

    /**
      * docid,doc
      * 1,    a a a a a a x x y  ->[0,0,0,0,6,0,2,0,0,1,0,0]
      * 2,    b b b x y          ->
      * 3,    c c x y            ->
      * 4,    d x                ->
      */
    // try/finally guarantees the SparkSession is closed even if a stage fails.
    try {
      // Load the raw data (header row expected: docid, doc).
      val df = spark.read.option("header", "true").csv(inputPath)

      // Split each document string into its whitespace-separated word tokens.
      val wordsDF = df.selectExpr("docid", "split(doc,' ') as words")

      // Hash the token lists into fixed-size term-frequency vectors.
      val tfUtil = new HashingTF()
        .setInputCol("words")
        .setOutputCol("tf_vec")
        .setNumFeatures(numFeatures)
      val tfDF = tfUtil.transform(wordsDF)
      tfDF.show(10, false)

      // Fit an IDF model on the TF vectors, then use it to rescale the
      // original TF vectors into TF-IDF vectors.
      val idfUtil = new IDF()
        .setInputCol("tf_vec")
        .setOutputCol("tfidf_vec")
      val model = idfUtil.fit(tfDF)
      val tfidf = model.transform(tfDF)
      tfidf.show(10, false)
    } finally {
      spark.close()
    }
  }

}
